# ───────────────────────────────────────────────────────────────────────────────
# 0. Install & load required packages
# ───────────────────────────────────────────────────────────────────────────────
# Install any listed package that is not yet available, then attach them all.
# requireNamespace(quietly = TRUE) returns FALSE for missing packages without
# printing anything, so `to_install` ends up holding only the missing ones.
pkgs <- c("sf", "dplyr", "readr", "tmap", "spdep", "tigris", "stringr")
to_install <- pkgs[!sapply(pkgs, requireNamespace, quietly = TRUE)]
if(length(to_install)) install.packages(to_install, repos = "https://cloud.r-project.org")
# Attach every package. lapply() returns (and the knitted document prints) the
# search path after each attach — that is the "[[1]] ..." transcript below.
lapply(pkgs, library, character.only = TRUE)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
## [[1]]
## [1] "sf" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "dplyr" "sf" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
##
## [[3]]
## [1] "readr" "dplyr" "sf" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "tmap" "readr" "dplyr" "sf" "stats" "graphics"
## [7] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "spdep" "spData" "tmap" "readr" "dplyr" "sf"
## [7] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [13] "base"
##
## [[6]]
## [1] "tigris" "spdep" "spData" "tmap" "readr" "dplyr"
## [7] "sf" "stats" "graphics" "grDevices" "utils" "datasets"
## [13] "methods" "base"
##
## [[7]]
## [1] "stringr" "tigris" "spdep" "spData" "tmap" "readr"
## [7] "dplyr" "sf" "stats" "graphics" "grDevices" "utils"
## [13] "datasets" "methods" "base"
# Cache downloaded Census shapefiles on disk so repeated runs skip the download.
options(tigris_use_cache = TRUE)
# ───────────────────────────────────────────────────────────────────────────────
# 1. Read your DMV attribute table & standardize FIPS
# ───────────────────────────────────────────────────────────────────────────────
# NOTE(review): absolute, user-specific path — this will only run on the
# original machine; consider a relative path or here::here().
dmv_data <- read_csv(
"/Users/g35441498/Desktop/spatial_statistics_final_project-main/dmv_data.csv",
show_col_types = FALSE
) %>%
# Left-pad tract FIPS codes to the full 11 digits (2 state + 3 county +
# 6 tract) so they match the Census GEOID strings exactly.
mutate(fips = str_pad(as.character(fips), width = 11, pad = "0"))
# ───────────────────────────────────────────────────────────────────────────────
# 2. Download & bind Census tract boundaries for DC, MD, VA
# ───────────────────────────────────────────────────────────────────────────────
# cb = TRUE fetches the generalized cartographic-boundary files (smaller,
# smoother polygons suited to thematic mapping).
dc_tracts <- tracts(state = "DC", year = 2022, cb = TRUE)
md_tracts <- tracts(state = "MD", year = 2022, cb = TRUE)
va_tracts <- tracts(state = "VA", year = 2022, cb = TRUE)
# NOTE(review): this assignment masks the tigris::tracts() function for the
# rest of the session; harmless here only because tracts() is not called again.
tracts <- bind_rows(dc_tracts, md_tracts, va_tracts) %>%
mutate(GEOID = str_pad(GEOID, width = 11, pad = "0"))
# ───────────────────────────────────────────────────────────────────────────────
# 3. Join attributes onto tract geometries
# ───────────────────────────────────────────────────────────────────────────────
# left_join keeps every tract polygon; tracts with no matching attribute row
# receive NA attributes (filtered out in the next step).
dmv_sf <- tracts %>%
left_join(dmv_data, by = c("GEOID" = "fips"))
# ───────────────────────────────────────────────────────────────────────────────
# 4. Filter & map asthma prevalence
# ───────────────────────────────────────────────────────────────────────────────
# Keep only tracts that actually have an asthma-prevalence value so the
# quantile class breaks are computed from observed data only.
dmv_sf_clean <- dmv_sf %>% filter(!is.na(asthma_prev))
tmap_mode("plot")
# Choropleth of asthma prevalence with quantile breaks.
# NOTE(review): rewritten in tmap v4 syntax — fill.scale = tm_scale_intervals()
# replaces style=/palette=, fill.legend = tm_legend() replaces title=,
# col_alpha replaces border.alpha, and tm_title() replaces tm_layout(title =).
# This silences the "[v3->v4] tmap v3 code detected" migration warnings the
# original call produced; "brewer.reds" is the v4 name for the old "Reds".
tm_shape(dmv_sf_clean) +
  tm_polygons(
    fill = "asthma_prev",
    fill.scale = tm_scale_intervals(style = "quantile", values = "brewer.reds"),
    fill.legend = tm_legend(title = "Asthma Prev (%)"),
    col_alpha = 0.4
  ) +
  tm_title("Asthma Prevalence in DMV Tracts")
# ───────────────────────────────────────────────────────────────────────────────
# 5. Build Queen contiguity neighbors & spatial weights
# ───────────────────────────────────────────────────────────────────────────────
# Queen contiguity: tracts sharing any boundary point (edge or corner) count
# as neighbours.
nb <- poly2nb(dmv_sf_clean, queen = TRUE)
# Row-standardised weights ("W"); zero.policy = TRUE tolerates island tracts
# that have no neighbours instead of raising an error.
lw <- nb2listw(nb, style = "W", zero.policy = TRUE)
# 6. Plot tract boundaries + neighbor links
plot(
  st_geometry(dmv_sf_clean),
  border = "lightgrey", reset = FALSE,
  main = "DMV Tracts & Queen Neighbors"
)
# FIX: take centroids of the geometry column only. Calling st_centroid() on
# the full sf object emitted "Warning: st_centroid assumes attributes are
# constant over geometries"; wrapping the geometry with st_geometry() computes
# the same coordinates without that spurious warning.
centroids <- st_coordinates(st_centroid(st_geometry(dmv_sf_clean)))
plot(nb, coords = centroids, add = TRUE, col = "blue")
# Global Moran’s I test for asthma prevalence
# Tests H0 of spatial randomness; the "greater" alternative targets positive
# spatial autocorrelation (similar values clustering in space).
moran_res <- spdep::moran.test(
dmv_sf_clean$asthma_prev,
lw,
zero.policy = TRUE
)
print(moran_res)
##
## Moran I test under randomisation
##
## data: dmv_sf_clean$asthma_prev
## weights: lw
##
## Moran I statistic standard deviate = 74.68, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 7.139356e-01 -2.611648e-04 9.145934e-05
# Moran scatterplot
# x-axis: asthma_prev; y-axis: its spatial lag. Points in the upper-right and
# lower-left quadrants indicate positive spatial association.
spdep::moran.plot(
dmv_sf_clean$asthma_prev,
lw,
zero.policy = TRUE,
labels = FALSE,
pch = 16,
main = "Moran Scatterplot: Asthma Prevalence"
)
# Interpretation:
# - Look at Moran’s I statistic and its p-value in moran_res.
# - A significantly positive Moran’s I (p < 0.05) indicates spatial clustering
# of high (or low) asthma prevalence, justifying spatial regression models.
#Step 3: variables choosing
# ---- 0. Packages ------------------------------------------------------------
# Additional packages for the variable-screening step (correlations, plots,
# tidy model output).
library(dplyr)
library(tidyr)
library(ggplot2)
library(broom) # tidy() for model output
library(psych) # describe(), corr.test()
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggcorrplot) # prettier correlation heat-map
# -----------------------------------------------------------------
# 0) Candidate predictors, organised by theme for easy sub-setting
# -----------------------------------------------------------------
## 0a. Pollution & physical environment
vars_pollute   <- c("pm2.5", "ozone", "diesel", "road_proximity",
                    "percent_in_flood_zone") # flood exposure
## 0b. Socio-economic status (SES & financial stress)
vars_ses       <- c("housing_cost_burden", "poverty", "unemployment")
## 0c. Education & language barriers
vars_edu_lang  <- c("no_highschool", "limited_english")
## 0d. Household crowding & caregiving
vars_household <- c("crowded_housing", "single_parent")
## 0e. Age structure
vars_age       <- c("age_under_17", "age_over_65")
## 0f. Racial/ethnic composition
vars_race      <- c("minority") # more groups (e.g. 'black', 'hispanic') can be added later
## 0g. Outcome
var_outcome    <- "asthma_prev"
# ---------------------------------------------------------------
# Master vector (outcome first) consumed by the modelling step
# ---------------------------------------------------------------
vars_all <- c(
  var_outcome,
  vars_pollute, vars_ses, vars_edu_lang,
  vars_household, vars_age, vars_race
)
# ---------------------------------------------------------------
# 1) Subset and drop rows with any missing values in vars_all
# ---------------------------------------------------------------
model_df <- dmv_data %>%
dplyr::select(all_of(vars_all)) %>%
tidyr::drop_na() # geometry stays intact if dmv_data is an sf object
# NOTE(review): dmv_data comes from read_csv(), so it is a plain tibble with
# no geometry — the comment above appears inherited from an sf version; verify.
## 2a. Spearman correlations with asthma_prev -------------------------------
# Spearman (rank) correlation is robust to skewed tract-level distributions;
# the "BH" adjustment controls the false-discovery rate across the many pairs.
cont_vars <- setdiff(names(model_df), "asthma_prev")
cor_out <- psych::corr.test(model_df[, c("asthma_prev", cont_vars)],
method = "spearman", adjust = "BH")
# neat heat-map (upper triangle) -------------------------------------------
ggcorrplot(cor_out$r,
hc.order = TRUE, # cluster similar variables
type = "upper",
lab = TRUE,
title = "Spearman correlations with asthma prevalence")
# ---- Packages ---------------------------------------------------------------
# FIX: use requireNamespace() to test availability instead of require().
# require() attaches the package as a side effect and returns FALSE rather
# than erroring, which makes it the wrong tool for dependency checks; the
# explicit library() call below does the attaching.
if (!requireNamespace("car", quietly = TRUE)) install.packages("car")
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(car) # for vif()
# ---- Trimmed-variable linear model -----------------------------------------
# OLS on the trimmed predictor set, used purely as a multicollinearity screen.
# NOTE(review): fitted on dmv_data (lm() drops NA rows itself), not on the
# complete-case model_df built above — the row sets may differ slightly.
lm_trim <- lm(
asthma_prev ~ pm2.5 + ozone + diesel + road_proximity +
percent_in_flood_zone +
housing_cost_burden + poverty + unemployment +
no_highschool + limited_english +
crowded_housing + single_parent +
age_under_17 + age_over_65,
data = dmv_data
)
# ---- Re-estimate VIFs -------------------------------------------------------
# Variance inflation factors; values near 1 mean little collinearity, and the
# screening threshold used here is 4.
vif_vals <- vif(lm_trim)
print(vif_vals)
## pm2.5 ozone diesel
## 3.795824 2.138744 3.189733
## road_proximity percent_in_flood_zone housing_cost_burden
## 1.450347 1.158879 2.704069
## poverty unemployment no_highschool
## 2.895095 1.405538 2.225353
## limited_english crowded_housing single_parent
## 2.395978 1.946115 2.055992
## age_under_17 age_over_65
## 1.828245 1.382131
# optional: flag any values > 4
vif_vals[vif_vals > 4]
## named numeric(0)
library(dplyr)
library(spdep)
library(tidyr)
# -------------------------------------------------------------------
# 1) Define variables for the final OLS / spatial-diagnostic workflow
# -------------------------------------------------------------------
# Final trimmed predictor set chosen after the correlation / VIF screening.
vars_selected <- c(
"asthma_prev",
"pm2.5", "ozone", "road_proximity", "percent_in_flood_zone",
"poverty", "crowded_housing", "limited_english",
"age_under_17", "minority"
)
# -------------------------------------------------------------------
# 2) Keep only tracts with complete data on vars
# -------------------------------------------------------------------
model_data <- dmv_sf_clean %>%
dplyr::select(all_of(vars_selected)) %>% # keep geometry
tidyr::drop_na()
# -------------------------------------------------------------------
# 3) Build neighbors & spatial weights on the SAME subset
# -------------------------------------------------------------------
# Rebuilt here because dropping incomplete rows changes the polygon set, so
# the earlier nb/lw objects no longer align row-for-row with model_data.
nb_mod <- spdep::poly2nb(model_data, queen = TRUE)
## Warning in spdep::poly2nb(model_data, queen = TRUE): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in spdep::poly2nb(model_data, queen = TRUE): neighbour object has 16 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
lw_mod <- spdep::nb2listw(nb_mod, style = "W", zero.policy = TRUE)
# -------------------------------------------------------------------
# 4) Fit ordinary least-squares (OLS) model
# -------------------------------------------------------------------
# Non-spatial baseline; its residuals are tested for autocorrelation below.
ols_model <- lm(
asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
poverty + crowded_housing + limited_english +
age_under_17 + minority,
data = model_data
)
summary(ols_model) # inspect coefficients
##
## Call:
## lm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
## poverty + crowded_housing + limited_english + age_under_17 +
## minority, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4385 -0.5649 -0.0220 0.5356 5.5714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.125e+01 4.019e-01 27.984 < 2e-16 ***
## pm2.5 -1.529e-01 4.737e-02 -3.229 0.00126 **
## ozone 1.686e-01 1.747e-02 9.652 < 2e-16 ***
## road_proximity -5.431e-04 5.034e-04 -1.079 0.28076
## percent_in_flood_zone -4.908e-04 1.517e-03 -0.324 0.74624
## poverty 1.782e-03 4.532e-05 39.318 < 2e-16 ***
## crowded_housing -1.317e-04 5.042e-04 -0.261 0.79395
## limited_english -2.268e-03 1.338e-04 -16.949 < 2e-16 ***
## age_under_17 -9.877e-04 4.753e-05 -20.778 < 2e-16 ***
## minority 2.607e-04 1.849e-05 14.103 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9432 on 2509 degrees of freedom
## Multiple R-squared: 0.514, Adjusted R-squared: 0.5123
## F-statistic: 294.9 on 9 and 2509 DF, p-value: < 2.2e-16
# -------------------------------------------------------------------
# 5) Attach residuals to the sf data frame
# -------------------------------------------------------------------
model_data$resid_ols <- residuals(ols_model)
# -------------------------------------------------------------------
# 6) Global Moran’s I test on the OLS residuals
# -------------------------------------------------------------------
# Spatially autocorrelated OLS residuals mean the iid-error assumption fails
# and a spatial specification (lag or error model) is warranted.
resid_moran <- spdep::moran.test(
model_data$resid_ols,
lw_mod,
zero.policy = TRUE
)
print(resid_moran)
##
## Moran I test under randomisation
##
## data: model_data$resid_ols
## weights: lw_mod
## n reduced by no-neighbour observations
##
## Moran I statistic standard deviate = 34.266, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.4634956078 -0.0003984064 0.0001832732
# -------------------------------------------------------------------
# 7) Moran scatterplot of OLS residuals
# -------------------------------------------------------------------
spdep::moran.plot(
model_data$resid_ols,
lw_mod,
zero.policy = TRUE,
labels = FALSE,
pch = 16,
main = "Moran Scatterplot: OLS Residuals"
)
library(spatialreg) # for lagsarlm() and errorsarlm()
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
## set.coresOption, set.mcOption, set.VerboseOption,
## set.ZeroPolicyOption
library(spdep) # for moran.test()
# ───────────────────────────────────────────────────────────────────────────────
# 4.1 Spatial Lag Model (SLM)
# ───────────────────────────────────────────────────────────────────────────────
# SLM adds the spatially lagged outcome (rho * W * y) to the regression, so a
# tract's asthma prevalence depends directly on its neighbours' prevalence.
slm_model <- lagsarlm(
asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
poverty + crowded_housing + limited_english +
age_under_17 + minority,
data = model_data,
listw = lw_mod,
zero.policy = TRUE
)
summary(slm_model)
##
## Call:lagsarlm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity +
## percent_in_flood_zone + poverty + crowded_housing + limited_english +
## age_under_17 + minority, data = model_data, listw = lw_mod,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.887119 -0.419133 -0.040171 0.363403 6.753178
##
## Type: lag
## Regions with no neighbours included:
## 18 267 381 569 781 929 1785 2138
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.9620e+00 3.7195e-01 16.0288 < 2.2e-16
## pm2.5 -6.5100e-02 3.8691e-02 -1.6825 0.09246
## ozone 1.0794e-01 1.4364e-02 7.5143 5.729e-14
## road_proximity -8.9647e-04 4.0981e-04 -2.1875 0.02870
## percent_in_flood_zone -1.6111e-03 1.2331e-03 -1.3065 0.19137
## poverty 1.3158e-03 3.9117e-05 33.6387 < 2.2e-16
## crowded_housing 2.9583e-04 4.1002e-04 0.7215 0.47059
## limited_english -1.4121e-03 1.1042e-04 -12.7884 < 2.2e-16
## age_under_17 -6.1044e-04 4.0211e-05 -15.1807 < 2.2e-16
## minority 1.5874e-04 1.5246e-05 10.4117 < 2.2e-16
##
## Rho: 0.44343, LR test value: 910.94, p-value: < 2.22e-16
## Asymptotic standard error: 0.014186
## z-value: 31.259, p-value: < 2.22e-16
## Wald statistic: 977.13, p-value: < 2.22e-16
##
## Log likelihood: -2966.586 for lag model
## ML residual variance (sigma squared): 0.58819, (sigma: 0.76694)
## Number of observations: 2519
## Number of parameters estimated: 12
## AIC: 5957.2, (AIC for lm: 6866.1)
## LM test for residual autocorrelation
## test value: 113.82, p-value: < 2.22e-16
# ───────────────────────────────────────────────────────────────────────────────
# 4.2 Spatial Error Model (SEM)
# ───────────────────────────────────────────────────────────────────────────────
# SEM models the spatial dependence in the error term (lambda * W * u) rather
# than in the outcome itself.
# NOTE(review): the object name 'sem_modelel' looks like a typo for
# 'sem_model'; it is kept because the AIC / pseudo-R² / Moran sections below
# reference this exact name. (A separate, identical 'sem_model' is refit in
# the SDEM step later.)
sem_modelel <- errorsarlm(
asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
poverty + crowded_housing + limited_english +
age_under_17 + minority,
data = model_data,
listw = lw_mod,
zero.policy = TRUE
)
summary(sem_modelel)
##
## Call:errorsarlm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity +
## percent_in_flood_zone + poverty + crowded_housing + limited_english +
## age_under_17 + minority, data = model_data, listw = lw_mod,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.238607 -0.359284 -0.042705 0.300107 6.130692
##
## Type: error
## Regions with no neighbours included:
## 18 267 381 569 781 929 1785 2138
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1581e+01 8.4766e-01 13.6625 < 2.2e-16
## pm2.5 -1.7084e-01 9.9495e-02 -1.7171 0.085958
## ozone 1.1450e-01 3.5950e-02 3.1850 0.001447
## road_proximity 3.1556e-04 5.4295e-04 0.5812 0.561105
## percent_in_flood_zone -2.3233e-03 1.4176e-03 -1.6388 0.101247
## poverty 1.1221e-03 3.8071e-05 29.4738 < 2.2e-16
## crowded_housing 7.2374e-04 3.5048e-04 2.0650 0.038922
## limited_english -7.7913e-04 1.1817e-04 -6.5931 4.306e-11
## age_under_17 -5.0756e-04 4.1256e-05 -12.3025 < 2.2e-16
## minority 1.1000e-04 1.9291e-05 5.7020 1.184e-08
##
## Lambda: 0.74488, LR test value: 1265.8, p-value: < 2.22e-16
## Asymptotic standard error: 0.013995
## z-value: 53.226, p-value: < 2.22e-16
## Wald statistic: 2833, p-value: < 2.22e-16
##
## Log likelihood: -2789.149 for error model
## ML residual variance (sigma squared): 0.45401, (sigma: 0.6738)
## Number of observations: 2519
## Number of parameters estimated: 12
## AIC: 5602.3, (AIC for lm: 6866.1)
# ───────────────────────────────────────────────────────────────────────────────
# 4.3 Compare AIC across models
# ───────────────────────────────────────────────────────────────────────────────
# Lower AIC = better fit after penalising parameters; SEM wins here.
aic_ols <- AIC(ols_model)
aic_slm <- AIC(slm_model)
aic_sem <- AIC(sem_modelel)
cat("AIC - OLS:", aic_ols, "\n",
"AIC - Spatial Lag:", aic_slm, "\n",
"AIC - Spatial Error:", aic_sem, "\n")
## AIC - OLS: 6866.108
## AIC - Spatial Lag: 5957.172
## AIC - Spatial Error: 5602.297
# ───────────────────────────────────────────────────────────────────────────────
# 4.4 Compute Pseudo-R² for each model
# ───────────────────────────────────────────────────────────────────────────────
# Pseudo-R² = 1 - SSE/SST: the share of total variance in y not left in the
# residuals. Serves as an R² analogue for the ML-fitted spatial models.
#   resid : residual vector from a fitted model
#   y     : observed response vector (same length)
pseudoR2 <- function(resid, y) {
ss_err <- sum(resid^2)                # residual sum of squares
ss_tot <- sum((y - mean(y))^2)        # total sum of squares about the mean
1 - ss_err / ss_tot
}
# Pseudo-R² for each candidate model (higher = more variance explained).
r2_ols <- pseudoR2(residuals(ols_model), model_data$asthma_prev)
r2_slm <- pseudoR2(residuals(slm_model), model_data$asthma_prev)
r2_sem <- pseudoR2(residuals(sem_modelel), model_data$asthma_prev)
cat("Pseudo-R² - OLS:", r2_ols, "\n",
"Pseudo-R² - Spatial Lag:", r2_slm, "\n",
"Pseudo-R² - Spatial Error:", r2_sem, "\n")
## Pseudo-R² - OLS: 0.5140215
## Pseudo-R² - Spatial Lag: 0.6774207
## Pseudo-R² - Spatial Error: 0.7510107
# ───────────────────────────────────────────────────────────────────────────────
# 4.5 Moran’s I on residuals for each model
# ───────────────────────────────────────────────────────────────────────────────
# A non-significant p-value means the model has absorbed the spatial
# autocorrelation; only the SEM residuals pass (p = 1).
moran_ols <- moran.test(residuals(ols_model), lw_mod, zero.policy = TRUE)
moran_slm <- moran.test(residuals(slm_model), lw_mod, zero.policy = TRUE)
moran_sem <- moran.test(residuals(sem_modelel), lw_mod, zero.policy = TRUE)
cat("Moran's I p-value - OLS Residuals:", moran_ols$p.value, "\n",
"Moran's I p-value - Lag Residuals:", moran_slm$p.value, "\n",
"Moran's I p-value - Error Residuals:", moran_sem$p.value, "\n")
## Moran's I p-value - OLS Residuals: 1.240044e-257
## Moran's I p-value - Lag Residuals: 1.227379e-17
## Moran's I p-value - Error Residuals: 1
#Step 6 Car model
# NOTE(review): the two comments below were swapped in the original —
# spautolm() lives in spatialreg, moran.test() in spdep.
library(spdep) # moran.test()
library(spatialreg) # spautolm() for CAR
# ── 1·1 Fit the CAR model (Gaussian response) ────────────────────────────────
# NOTE(review): a proper CAR fit needs SYMMETRIC weights; the row-standardised
# "W" style listw is non-symmetric, which triggers the warning below and the
# NaN standard error for lambda. A binary "B" style listw would be symmetric —
# worth confirming/refitting.
car_model <- spautolm(
asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
poverty + crowded_housing + limited_english +
age_under_17 + minority,
data = model_data,
listw = lw_mod,
family = "CAR", # <-- conditional autoregressive
zero.policy = TRUE
)
## Warning in spautolm(asthma_prev ~ pm2.5 + ozone + road_proximity +
## percent_in_flood_zone + : Non-symmetric spatial weights in CAR model
## Warning in sqrt(fdHess[1, 1]): NaNs produced
summary(car_model)
##
## Call: spautolm(formula = asthma_prev ~ pm2.5 + ozone + road_proximity +
## percent_in_flood_zone + poverty + crowded_housing + limited_english +
## age_under_17 + minority, data = model_data, listw = lw_mod,
## family = "CAR", zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.964024 -0.441604 -0.025786 0.377974 5.879373
##
## Regions with no neighbours included:
## 18 267 381 569 781 929 1785 2138
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.3276e+00 5.4330e-01 17.1687 < 2.2e-16
## pm2.5 4.4551e-02 6.4223e-02 0.6937 0.4878701
## ozone 5.6472e-02 2.4136e-02 2.3398 0.0192963
## road_proximity -3.9400e-03 5.6148e-04 -7.0172 2.264e-12
## percent_in_flood_zone 1.4221e-02 1.6010e-03 8.8822 < 2.2e-16
## poverty 1.3377e-03 4.5656e-05 29.2997 < 2.2e-16
## crowded_housing -2.3843e-04 4.5277e-04 -0.5266 0.5984644
## limited_english -2.0298e-03 1.3684e-04 -14.8329 < 2.2e-16
## age_under_17 -1.6299e-04 4.8268e-05 -3.3767 0.0007336
## minority 2.0228e-04 2.0867e-05 9.6942 < 2.2e-16
##
## Lambda: 0.65445 LR test value: 488.12 p-value: < 2.22e-16
## Numerical Hessian standard error of lambda: NaN
##
## Log likelihood: -3177.995
## ML residual variance (sigma squared): 0.68789, (sigma: 0.82939)
## Number of observations: 2519
## Number of parameters estimated: 12
## AIC: 6380
# ── 1·2 Compare information criteria (lower = better) ────────────────────────
AICvals <- c(
OLS = AIC(ols_model),
SLM = AIC(slm_model),
SEM = AIC(sem_modelel),
CAR = AIC(car_model)
)
print(round(AICvals, 1))
## OLS SLM SEM CAR
## 6866.1 5957.2 5602.3 6380.0
# ── 1·3 Pseudo-R² for CAR (same formula as before) ───────────────────────────
# NOTE(review): redefines the pseudoR2 helper from section 4.4 (identical
# behaviour, one-line form).
pseudoR2 <- function(resid, y) 1 - sum(resid^2)/sum((y - mean(y))^2)
r2_car <- pseudoR2(residuals(car_model), model_data$asthma_prev)
print(r2_car)
## [1] 0.6834214
# ── 1·4 Residual Moran’s I to check remaining autocorrelation ────────────────
moran_car <- moran.test(residuals(car_model), lw_mod, zero.policy = TRUE)
cat("CAR residual Moran p-value:", moran_car$p.value, "\n")
## CAR residual Moran p-value: 0.997895
#Step7: SDEM model
library(spatialreg)
library(Matrix)
# ───────────────────────────────────────────────────────────────────────────────
# 1. Fit SEM & SDEM
# ───────────────────────────────────────────────────────────────────────────────
# SEM (Durbin = FALSE): spatial dependence in the errors only.
sem_model <- errorsarlm(
asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
poverty + crowded_housing + limited_english +
age_under_17 + minority,
data = model_data,
listw = lw_mod,
Durbin = FALSE,
zero.policy = TRUE
)
# SDEM (Durbin = TRUE): SEM plus spatially lagged covariates (W*X terms),
# so it nests sem_model — a likelihood-ratio test between them is valid.
sdem_model <- errorsarlm(
asthma_prev ~ pm2.5 + ozone + road_proximity + percent_in_flood_zone +
poverty + crowded_housing + limited_english +
age_under_17 + minority,
data = model_data,
listw = lw_mod,
Durbin = TRUE,
zero.policy = TRUE
)
# ───────────────────────────────────────────────────────────────────────────────
# 2. Likelihood-ratio test (SDEM vs SEM)
# ───────────────────────────────────────────────────────────────────────────────
# Perform likelihood-ratio test
lr_res <- anova(sdem_model, sem_model)
print(lr_res)
## Model df AIC logLik Test L.Ratio p-value
## sdem_model 1 21 5455.8 -2706.9 1
## sem_model 2 12 5602.3 -2789.2 2 164.48 0
# ───────────────────────────────────────────────────────────────────────────────
# 3. Hausman test (with fallback to pseudo-inverse)
# ───────────────────────────────────────────────────────────────────────────────
# Hausman-style comparison of a larger ("big") and nested smaller ("small")
# model. Tries spatialreg::Hausman.test() first; if that errors (here the
# covariance difference is computationally singular), falls back to the manual
# chi-square statistic
#     q' (V_small - V_big)^{-1} q,   q = beta_big - beta_small,
# using a Moore-Penrose pseudo-inverse when the difference matrix cannot be
# inverted directly. Returns either the htest object or a list with
# statistic / parameter / p.value.
haus_fun <- function(big, small, tol = 1e-10) {
  out <- tryCatch(
    spatialreg::Hausman.test(big, small, tol = tol),
    error = function(e) {
      # Manual computation if Hausman.test fails
      beta_big <- coef(big)
      # Align the small model's coefficients to the big model's names/order;
      # coefficients absent from the small model come back as NA.
      beta_small <- coef(small)[names(beta_big)]
      V_big <- vcov(big)
      V_small <- vcov(small)[names(beta_big), names(beta_big)]
      V_diff <- V_small - V_big # Note: Order matters (V_small - V_big)
      # Use pseudo-inverse if matrix is near-singular.
      # BUG FIX: ginv() lives in MASS, not Matrix — the original called
      # Matrix::ginv(), which does not exist and would error inside this
      # error handler.
      V_diff_inv <- try(solve(V_diff), silent = TRUE)
      if (inherits(V_diff_inv, "try-error")) {
        V_diff_inv <- MASS::ginv(V_diff)
      }
      q_diff <- beta_big - beta_small
      # drop() collapses the 1x1 matrix product to a plain scalar so the
      # statistic and p-value print cleanly.
      chi <- drop(t(q_diff) %*% V_diff_inv %*% q_diff)
      p_val <- pchisq(chi, df = length(beta_big), lower.tail = FALSE)
      list(statistic = chi, parameter = length(beta_big), p.value = p_val)
    }
  )
  return(out)
}
# Attempt the Hausman test; as the transcript shows, the built-in routine
# fails on a computationally singular (Vo - Vs) and the fallback result may
# carry no usable p-value.
haus_res <- haus_fun(sdem_model, sem_model)
## Error in solve.default((Vo - Vs), tol = tol) :
## system is computationally singular: reciprocal condition number = 1.39971e-11
## Warning in Hausman.test.Sarlm(big, small, tol = tol): (Vo - Vs) inversion
## failure
# Manual likelihood-ratio test of SDEM (full) vs SEM (nested).
# Get log-likelihoods
ll_sdem <- as.numeric(logLik(sdem_model))
ll_sem <- as.numeric(logLik(sem_model))
# Calculate test statistic
LR_stat <- 2 * (ll_sdem - ll_sem)
# df = number of extra coefficients in SDEM (the lagged-X terms).
df_diff <- length(coef(sdem_model)) - length(coef(sem_model))
# Get p-value
p_value <- 1 - pchisq(LR_stat, df_diff)
# Create results
lr_res <- list(
statistic = LR_stat,
parameter = df_diff,
p.value = p_value,
method = "Likelihood ratio test for nested spatial models"
)
class(lr_res) <- "htest"
print(lr_res)
# NOTE(review): the print shows bare "=" signs because statistic/parameter are
# unnamed; e.g. statistic = c(LR = LR_stat) would label them.
##
## Likelihood ratio test for nested spatial models
##
## data:
## = 164.48, = 9, p-value < 2.2e-16
# ───────────────────────────────────────────────────────────────────────────────
# 4. Model comparison table
# ───────────────────────────────────────────────────────────────────────────────
# NOTE(review): because Hausman.test() failed above, haus_res may hold no
# numeric p.value — hence the NA in the table below.
haus_p <- as.numeric(haus_res$p.value) # force numeric
lr_p <- as.numeric(lr_res$p.value) # you did this already
comp <- data.frame(
Model = c("SEM", "SDEM"),
AIC = c(AIC(sem_model), AIC(sdem_model)),
LogLik = c(logLik(sem_model), logLik(sdem_model)),
LR_p = c(NA, round(lr_p, 4)),
Hausman_p = c(NA, round(haus_p, 4))
)
print(comp, row.names = FALSE)
## Model AIC LogLik LR_p Hausman_p
## SEM 5602.297 -2789.149 NA NA
## SDEM 5455.814 -2706.907 0 NA
cat("\nDecision rules:\n",
"• LR_p < 0.05 ⇒ SDEM fits significantly better → prefer SDEM.\n",
"• Hausman_p < 0.05 ⇒ SEM is inconsistent → prefer SDEM.\n",
"• If both p > 0.05 and SEM has lower AIC, prefer SEM.\n")
##
## Decision rules:
## • LR_p < 0.05 ⇒ SDEM fits significantly better → prefer SDEM.
## • Hausman_p < 0.05 ⇒ SEM is inconsistent → prefer SDEM.
## • If both p > 0.05 and SEM has lower AIC, prefer SEM.
# ───────────────────────────────────────────────────────────────────────────────
# STEP 4 : Compare all candidate models and pick the “winner”
# ───────────────────────────────────────────────────────────────────────────────
# Assumes you already have these fitted objects in memory:
# ols_model – ordinary least-squares
# slm_model – spatial lag model
# sem_model – spatial error model (or sem_modelel in earlier code)
# sdem_model – spatial Durbin-error model
# car_model – conditional autoregressive model (optional; omit if not fitted)
# 4·1 Gather and rank by AIC
models <- list(
OLS = ols_model,
SLM = slm_model,
SEM = sem_model,
SDEM = sdem_model,
CAR = car_model # remove this line if you did not fit CAR
)
aic_tbl <- data.frame(
Model = names(models),
AIC = sapply(models, AIC, USE.NAMES = FALSE),
LogLik= sapply(models, logLik, USE.NAMES = FALSE)
) |>
dplyr::arrange(AIC)
print(aic_tbl, row.names = FALSE)
## Model AIC LogLik
## SDEM 5455.814 -2706.907
## SEM 5602.297 -2789.149
## SLM 5957.172 -2966.586
## CAR 6379.991 -3177.995
## OLS 6866.108 -3422.054
#Step 8: visualization
##residual
# Attach predicted values and residuals from SEM model
# NOTE(review): predict() on a Sarlm object with no newdata uses the observed
# response (the message below is spatialreg saying exactly that).
model_data$pred_sem <- predict(sem_model)
## This method assumes the response is known - see manual page
model_data$resid_sem <- residuals(sem_model)
library(tmap)
tmap_mode("plot")
## ℹ tmap mode set to "plot".
# ---------------------------------------------------------------
# 1 & 2. Observed and Predicted maps side-by-side
# ---------------------------------------------------------------
# NOTE(review): tmap v3 syntax — still works but emits the [v3->v4] migration
# warnings captured below; see the v4-style map in section 4 for the new form.
tmap_arrange(
tm_shape(model_data) +
tm_polygons("asthma_prev", palette = "Reds", title = "Observed Asthma") +
tm_layout(title = "Observed Asthma"),
tm_shape(model_data) +
tm_polygons("pred_sem", palette = "Blues", title = "Predicted (SEM)") +
tm_layout(title = "Predicted (SEM)"),
ncol = 2
)
##
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_tm_polygons()`: migrate the argument(s) related to the scale of
## the visual variable `fill` namely 'palette' (rename to 'values') to fill.scale
## = tm_scale(<HERE>).[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`[cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Reds" is named
## "brewer.reds"Multiple palettes called "reds" found: "brewer.reds", "matplotlib.reds". The first one, "brewer.reds", is returned.
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "Blues" is named
## "brewer.blues"Multiple palettes called "blues" found: "brewer.blues", "matplotlib.blues". The first one, "brewer.blues", is returned.
# ---------------------------------------------------------------
# 3. Residuals map
# ---------------------------------------------------------------
# Diverging palette centred on 0: red = under-prediction, blue = over-.
tm_shape(model_data) +
tm_polygons("resid_sem", palette = "-RdBu", style = "quantile",
title = "Residuals (SEM)") +
tm_layout(title = "Residuals (SEM)")
##
## ── tmap v3 code detected ───────────────────────────────────────────────────────
## [v3->v4] `tm_polygons()`: instead of `style = "quantile"`, use fill.scale =
## `tm_scale_intervals()`.
## ℹ Migrate the argument(s) 'style', 'palette' (rename to 'values') to
## 'tm_scale_intervals(<HERE>)'[v3->v4] `tm_polygons()`: migrate the argument(s) related to the legend of the
## visual variable `fill` namely 'title' to 'fill.legend = tm_legend(<HERE>)'[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`Multiple palettes called "rd_bu" found: "brewer.rd_bu", "matplotlib.rd_bu". The first one, "brewer.rd_bu", is returned.
## [scale] tm_polygons:() the data variable assigned to 'fill' contains positive and negative values, so midpoint is set to 0. Set 'midpoint = NA' in 'fill.scale = tm_scale_intervals(<HERE>)' to use all visual values (e.g. colors)
## [cols4all] color palettes: use palettes from the R package cols4all. Run
## `cols4all::c4a_gui()` to explore them. The old palette name "-RdBu" is named
## "rd_bu" (in long format "brewer.rd_bu")
## Model comparison visuals
# 6 Coefficient dot plot (direct effects only)
library(broom)
# One row per SDEM term with point estimate and 95% confidence bounds.
sdem_coefs <- tidy(sdem_model, conf.int = TRUE)
ggplot(sdem_coefs, aes(x = estimate, y = term)) +
  geom_pointrange(aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  labs(x = "Estimate (95% CI)", y = NULL, title = "SDEM Coefficients") +
  theme_bw()
# 7 AIC bar plot
# Bars ordered best-to-worst (lowest AIC first) with the value printed on top.
ggplot(aic_tbl, aes(x = reorder(Model, AIC), y = AIC)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = round(AIC, 1)), vjust = -0.3) +
  labs(y = "AIC", x = NULL, title = "Model Fit Comparison (lower is better)") +
  theme_minimal()